import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
precision_score,
recall_score,
roc_auc_score,
confusion_matrix,
)
We are using the missingno library to visualize missing rows in the data.
import sys
# NOTE: the two lines below are IPython/Jupyter shell magics (not plain Python);
# they install the extra packages into the same interpreter running this notebook.
!{sys.executable} -m pip install missingno
!{sys.executable} -m pip install sklearn_extra
import missingno as msno  # missing-value matrix visualizations
The values of these thresholds were chosen after extensive experiments and calculations. We will justify these thresholds in detail in the report.
# Load the raw survey responses and the dictionary describing every column.
survey_data=pd.read_csv("./data/data.csv",low_memory=False)
data_dictionaary = pd.read_excel('./data/Survey Data Dictionary.xlsx')  # (name typo kept: referenced throughout the notebook)
row_remove_threshold = 70 #rows with more than 70% missing values are dropped; tune and re-check gender prediction results
drop_threshold = 60 #only columns with 60% Non-Empty values will be considered
chunk_size = 50 #number of columns per missing-value matrix chart
Most people don't fill in survey data willingly, or they skip input fields, so it is a good idea to keep only quality survey data. We have a reasonable number of rows (20K+). We clean the data by rows first and then deal with the columns.
# rows_nulls_percentage: percentage of missing values in every row (truncated to int)
rows_nulls_percentage = list((survey_data.isnull().sum(axis=1) / len(survey_data.columns)) * 100)
# converting float to int so the percentages can be bucketed for visualization
rows_nulls_percentage = [int(i) for i in rows_nulls_percentage]
# y_axis[p] = number of rows whose missing percentage is exactly p.
# BUG FIX: 101 buckets (0..100 inclusive) — a completely empty row is 100%
# missing and overflowed the original 100-element array (IndexError).
y_axis = np.zeros(101)
for perc in rows_nulls_percentage:
    y_axis[perc] += 1
# report "X rows skipped p% of the columns" for every percentage bucket
for percent in range(1, 101):
    print(y_axis[percent], "\tpeople skiped ", str(percent), "\b% (", str((percent * len(survey_data.columns)) / 100), ") columns")
As described in the cell above, we are calculating the rows' missing percentages. For example, if rows r1, r2 and r3 each have 1% missing values, then we store 3 against 1 percent. That is, X rows have x% missing values.
# Bar chart: how many rows fall into each missing-value percentage bucket.
ax = plt.gca()
ax.bar(range(len(y_axis)), y_axis)
ax.set_title('Missing Rows Spread in percentage')
ax.set_ylabel("No. Of People/Rows")
ax.set_xlabel("Missing Percentage")
# More precise view of the same distribution: only the percentages that
# actually occur in the data, with their exact row counts.
dd = pd.Index(rows_nulls_percentage)
p_c = dd.value_counts(sort=False, ascending=True)
#print(p_c)
# BUG FIX: Index.sort_values() is NOT in-place — the original discarded its
# return value, so the x axis and the printout stayed unsorted. Assign it.
unique_percentages = dd.unique().sort_values()
# row count per occurring percentage, aligned with unique_percentages
y_a = [int((dd == u_per).sum()) for u_per in unique_percentages]
y_a, unique_percentages
plt.bar(unique_percentages, y_a)
plt.title('Missing Row values Spread in percentage')
plt.ylabel("No. Of People/Rows")
plt.xlabel("Missing Percentage")
for index in range(len(unique_percentages)):
    print(y_a[index], "\tPeople skipped ", unique_percentages[index], "\b% fields")
In the cell below, the indices of the rows that do not match our filter criteria are calculated, and then those rows are dropped.
# Reload the untouched data, then drop every row whose missing-value
# percentage exceeds the configured threshold.
survey_data = pd.read_csv("./data/data.csv", low_memory=False)
rows_indeces_to_remove = [
    index
    for index, perc in enumerate(rows_nulls_percentage)
    if perc > row_remove_threshold
]
print(len(rows_indeces_to_remove), "rows will be removed with threshold", row_remove_threshold, "\b%")
survey_data.drop(survey_data.index[rows_indeces_to_remove], inplace=True)
survey_data.shape
Note: this cell executes with a warning and takes about a minute to run.
Each graph shows details of missing cells for 50 columns (a limitation of the graph library). A black horizontal marker indicates a filled value, while white space represents a missing value.
# Missing-value matrix for every column, drawn in chunks of `chunk_size`
# columns (the msno matrix becomes unreadable beyond ~50 columns).
cols = survey_data.columns
cols_length = len(cols)
for index in range(0, cols_length, chunk_size):
    # renamed from `set`: the original shadowed the built-in set() type
    chunk = survey_data[cols[index:index + chunk_size]]
    msno.matrix(chunk, inline=False)
data.csv has 1235 columns, but we have details for only 1105 columns in the dictionary. The cell below lists the columns that are in data.csv (UNCLEANED) but not in the dictionary. We did not remove columns merely because they have missing values; we handled missing values properly, but only after dropping the columns that do not match our criteria.
# Columns present in data.csv but absent from the dictionary.
cols_in_data = survey_data.columns
print("Columns in Data.csv", len(cols_in_data))
cols_in_dic = len(data_dictionaary)
print("Columns in Dictionary.xlsx", cols_in_dic)
columns_in_dictionary = list(data_dictionaary['Column Name'])
# PERF FIX: membership tests against a set are O(1); the original scanned the
# whole dictionary list once per data column (O(n*m)).
dictionary_column_set = set(columns_in_dictionary)
missing_cols_in_dic = [str(col) for col in cols_in_data if col not in dictionary_column_set]
print("Following are ", len(missing_cols_in_dic), " columns for which there's no information in dictionary:\n\n", missing_cols_in_dic)
#columns_in_dictionary
# Per-column missing-value audit: count fully-filled and fully-empty columns,
# and collect every column whose non-empty share falls below drop_threshold
# (those columns are then dropped).
all_columns = survey_data.columns
data_size_N = survey_data.shape[0]
cols_with_missing_values_count = 0
complete_col_count = 0
null_col_count = 0
should_drop_count = 0
columns_to_drop = []
# one vectorized pass instead of re-counting NaNs column by column
missing_per_column = survey_data.isnull().sum()
for col in all_columns:
    missing_values_count = missing_per_column[col]
    non_empty_values_percentage = ((data_size_N - missing_values_count) / data_size_N) * 100
    if missing_values_count == 0:
        complete_col_count += 1
    if missing_values_count == data_size_N:
        null_col_count += 1
    if non_empty_values_percentage < drop_threshold:
        should_drop_count += 1
        columns_to_drop.append(col)
    if missing_values_count > 0:
        cols_with_missing_values_count += 1
remaining_columns_count = len(all_columns) - should_drop_count
print("Total ", cols_with_missing_values_count, " columns have missing values not matching our criteria")
print("Total ", should_drop_count, "columns should be dropped with threshold", drop_threshold, "\b%")
print("Total columns with all filled values", complete_col_count, "\nTotal columns with all null values", null_col_count, "\nNumber of rows in data", data_size_N, "\nColumns will remain", remaining_columns_count)
len(columns_to_drop)
data_filtered = survey_data.drop(columns_to_drop, axis=1)
data_filtered.shape
It can be seen that we are left with good enough data to do gender prediction. If we had not dropped the rows and columns and had instead filled every column with an imputation method, we might have introduced bias into our data, which is not good when building features for a gender prediction model.
# Re-draw the missing-value matrix after row/column filtering.
cols = data_filtered.columns
cols_length = len(cols)
for index in range(0, cols_length, chunk_size):
    # renamed from `set`: the original shadowed the built-in set() type
    chunk = data_filtered[cols[index:index + chunk_size]]
    msno.matrix(chunk, inline=False)
def drawBarChartOfTypes(data_filtered, title):
    """Bar chart of how many columns have each dtype.

    BUG FIX: the original paired dtypes.unique() (appearance order) with
    dtypes.value_counts() (sorted by count), so a label could be attached to
    the wrong bar. Iterating value_counts().items() keeps every dtype paired
    with its own count.
    NOTE: the chart title reads the notebook-global `remaining_columns_count`.
    """
    columns_types_counts = data_filtered.dtypes.value_counts()
    barchart_x = []
    barchart_y = []
    for col_type, col_type_count in columns_types_counts.items():
        barchart_x.append(str(col_type))
        barchart_y.append(col_type_count)
    bars = plt.bar(barchart_x, barchart_y)
    plt.title(title + " , Total Columns: " + str(remaining_columns_count))
    # annotate each bar with its count
    for bar in bars:
        yval = bar.get_height()
        plt.text(bar.get_x(), yval + .05, yval)
Counts of all unique data types for columns in data
drawBarChartOfTypes(data_filtered, 'Columns Types Counts')  # dtype distribution of the filtered data
Following cell prints the columns that are in our filtered data but we don't have any information about them in dictionary.
# Columns kept in the filtered data that have no entry in the dictionary
# (not ALL missing columns — only the ones we actually need).
columns_in_dictionary = list(data_dictionaary['Column Name'])
filtered_columns = data_filtered.columns
missing_columns_in_dictionary = [
    filtered_column
    for filtered_column in filtered_columns
    if filtered_column not in columns_in_dictionary
]
(missing_columns_in_dictionary)
# Pairwise correlations of the (still partly object-typed) data; pandas
# silently excludes non-numeric columns here — handled in the next cells.
data_filtered_corr_matrix = data_filtered.corr()
#ax = sns.heatmap(data_filtered_corr_matrix, annot=True, cmap="YlGnBu")
After calculating the correlation matrix, we noticed that the number of columns in the correlation matrix does not equal the number of columns in filtered_data. On analysis, we found that we need to handle the object data types to include them in the correlation matrix.
Even though these columns have type object, they still contain numeric values, except for two object columns, LN2_RIndLngBEOth and LN2_WIndLngBEOth, which we decided to drop because they hold location information and have nothing to do with gender. More justification is in the report.
In the column FB16_6 we found a garbage value 'Gujarati' at index [10619]; as per the dictionary, this column is supposed to contain only numeric values. Assuming the whole row might not have been filled in properly, we removed the row.
# WARNING: this cell mutates data_filtered (column drops plus one row drop).
# The destructive steps are guarded below so re-running the cell is safe.
missing_correlation_columns = []
for filtered_column in filtered_columns:
    if filtered_column not in data_filtered_corr_matrix:
        missing_correlation_columns.append(filtered_column)
# corr() skipped exactly the object-typed columns
print("Missing Columns in Correlation Matrix", missing_correlation_columns)
print("Object type columns", list(data_filtered.select_dtypes(['object']).columns))
# 'LN2_RIndLngBEOth'/'LN2_WIndLngBEOth' hold free-text values, are absent from
# the dictionary and unrelated to gender, so they are dropped outright.
unknown_cols_to_drop = ['LN2_RIndLngBEOth', 'LN2_WIndLngBEOth']
for uk_col in unknown_cols_to_drop:
    if uk_col in data_filtered:
        data_filtered = data_filtered.drop([uk_col], axis=1)
    if uk_col in missing_correlation_columns:
        missing_correlation_columns.remove(uk_col)
print("Change type of", missing_correlation_columns)
# pd.to_numeric previously failed with: 'Unable to parse string "Gujarati" at
# position 10619' in FB16_6 — a value outside the dictionary's domain, so the
# whole row is treated as garbage and removed.
# BUG FIX: the original dropped positional row 10619 unconditionally, deleting
# a *different*, valid row on every re-execution. Drop it only when the
# garbage value is actually present.
if 'FB16_6' in data_filtered and len(data_filtered) > 10619 and data_filtered['FB16_6'].iloc[10619] == "Gujarati":
    data_filtered.drop(data_filtered.index[10619], inplace=True)
print(data_filtered.shape)
data_filtered[missing_correlation_columns] = data_filtered[missing_correlation_columns].apply(pd.to_numeric)
Counts of all unique data types for columns in data after data type conversion
drawBarChartOfTypes(data_filtered, "Columns Types Count after changing data types Obj=> Numeric")  # every column should now have a numeric dtype
This time size of columns in correlation matrix matches with size of columns in data
#recompute correlation matrix — now that every column is numeric it covers all columns
data_filtered_corr_matrix = data_filtered.corr()
print("Columns in correlation matrix", len(data_filtered_corr_matrix), "\nColumns in data", data_filtered.shape[1])
def drawCorrelationChart(col_name, data, title):
    """Bar chart of one column of a correlation matrix.

    col_name: column of `data` to plot (e.g. "Gender").
    data: correlation DataFrame.
    title: chart title.
    """
    correlations = list(data[col_name].values)
    positions = range(len(correlations))
    plt.bar(positions, correlations)
    plt.title(title)
    plt.xlabel('Column Number')
    plt.ylabel('Correlation')
As per the project requirements, our task is to predict gender. For further data cleaning we want to know each column's correlation with gender.
drawCorrelationChart("Gender",data_filtered_corr_matrix, "Correlation Before Filling")  # per-column correlation with Gender, pre-imputation
For now, we decided not to drop the columns with low or negative correlation, because we expect our gender predictor [next deliverable] to handle negative weights as well. We are just writing a method to filter columns by a correlation threshold, in case we need it in the future.
def getCorrelatedColumns(corr_matrix, threshold):
    """Return the column names whose correlation with 'Gender' exceeds threshold.

    corr_matrix: square correlation DataFrame containing a 'Gender' column.
    threshold: strict lower bound on the correlation value.
    (Removed the unused `col_corr` local from the original.)
    """
    gender_correlation = corr_matrix['Gender']
    columns = corr_matrix.columns
    # pairs row i of the Gender column with column name i — assumes a square matrix
    return [columns[index] for index, cor in enumerate(gender_correlation) if cor > threshold]
print(getCorrelatedColumns(data_filtered_corr_matrix, -0.3))  # keep even weakly/negatively correlated columns for now
We separated out the part of the dictionary containing information for the columns present in our filtered data, wherever that information is available in the full dictionary.
# Keep only the dictionary rows that describe columns surviving the filtering.
data_dictionaary_filtered = pd.DataFrame(data_dictionaary)
# PERF FIX: the original rebuilt list(...['Column Name']) on EVERY loop
# iteration (O(n^2)); build the lookup structures once.
dictionary_column_names = list(data_dictionaary_filtered['Column Name'])
kept_columns = set(data_filtered.columns)
rows_indexes_to_delete = [
    index
    for index, col_name in enumerate(dictionary_column_names)
    if col_name not in kept_columns
]
print(len(rows_indexes_to_delete), len(data_filtered.columns))
data_dictionaary_filtered.drop(data_dictionaary_filtered.index[rows_indexes_to_delete], inplace=True)
len(data_dictionaary_filtered), data_dictionaary_filtered
data_dictionaary_filtered.to_excel(r'./data/filtered_dictionary.xlsx', engine='xlsxwriter')
We are doing the filling in two steps:
Filling with default values: We noticed that there are a few columns we can fill with their default values instead of filling with the mean or mode, which could introduce bias. So we filled them with the default value, described as DK (99) in the dictionary. When we do gender prediction, we might consider replacing 99 with another number, perhaps the median value. But for now we fill with 99 as per the dictionary.
Filling values with the mode: For now we fill the remaining missing values with the mode, which means taking the opinion of the majority of people. We might revisit this and consider filling with the median instead of the mode, i.e. 3 for domain values [1,2,3,4,5].
data_dictionaary_filtered
# Columns whose dictionary domain includes '99=DK' ("don't know") can be
# filled with that explicit default instead of a statistic.
# PERF FIX: the original re-materialized both dictionary columns as lists on
# every iteration; zip over them once instead.
columns_to_be_filled_with_dk = []
for domain_values, col_name in zip(data_dictionaary_filtered['Values'],
                                   data_dictionaary_filtered['Column Name']):
    # guard: a domain cell could be NaN (float) — `in` would raise on non-strings
    if isinstance(domain_values, str) and '99=DK' in domain_values:
        columns_to_be_filled_with_dk.append(col_name)
print("Following", len(columns_to_be_filled_with_dk), "can be filled with default value", columns_to_be_filled_with_dk)
For now we are considering 99 as per dictionary, we might consider replacing 99 with median value of column domain values.
# Impute the DK-capable columns with the dictionary default 99 ("don't know").
for col_name in columns_to_be_filled_with_dk:
    data_filtered[col_name] = data_filtered[col_name].fillna(99)
For now we are considering mode value by keeping the opinion/answer of the most of the people. We might consider replacing it with median of domain values.
# Every remaining gap is filled with the column's mode (most common answer).
for column in data_filtered.columns:
    most_common = data_filtered[column].mode()[0]
    data_filtered[column] = data_filtered[column].fillna(most_common)
Visualization of the correlation with gender after filling. We can see there is not much bias after filling the data.
# Compare Gender correlations before vs. after imputation to eyeball any bias
# introduced by the filling step.
data_filtered_corr_matrix_fil = data_filtered.corr()
drawCorrelationChart("Gender",data_filtered_corr_matrix, "Correlation Before Filling")
plt.figure()  # new figure so the two charts don't overlap
drawCorrelationChart("Gender",data_filtered_corr_matrix_fil, "Correlation After Filling")
No value in the data is left unfilled.
# Final missing-value matrices (should be solid black: nothing missing),
# then persist the cleaned data and reload it for modelling.
cols = data_filtered.columns
cols_length = len(cols)
for index in range(0, cols_length, chunk_size):
    # renamed from `set`: the original shadowed the built-in set() type
    chunk = data_filtered[cols[index:index + chunk_size]]
    msno.matrix(chunk, inline=False)
data_filtered.to_csv(r'./data/filtered_data.csv', index=False, header=True)
data_filtered = pd.read_csv("./data/filtered_data.csv", low_memory=False)
data_for_clustering = data_filtered.drop(['Gender'], axis=1)
Splitting the data into train and test sets with an 80/20 ratio.
Note: the train/test split is only for classification; we use the whole dataset for clustering.
from sklearn.model_selection import train_test_split
# 80/20 split, stratified on Gender so both sets keep the class balance.
# NOTE(review): no random_state is set, so the split (and all downstream
# metrics) will differ between runs — consider pinning one for reproducibility.
train_data, test_data = train_test_split(data_filtered, test_size=0.20, stratify=data_filtered['Gender'])
y_train = train_data.Gender
y_test = test_data.Gender
X_test = test_data.drop('Gender', axis = 1)
X_train = train_data.drop('Gender', axis = 1)
X_all = data_for_clustering  # clustering uses the full dataset, not the split
y_all = data_filtered.Gender
len(X_train),len(X_test), len(X_all), len(y_all)
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
def printStats(algoResults):
    """Print a metrics table (TP/TN/FP/FN, F1, accuracy, precision, ROC/AUC).

    algoResults: list of [name, confusion_matrix, roc_auc] where
    confusion_matrix uses the layout produced by getConfusionMatix
    ([[TP, FP], [FN, TN]]) and roc_auc is a float in [0, 1].
    """
    print('\nAlgorithm\t|TPs\t|TNs\t|FPs\t|FNs\t\t|F1-Score\t|Accuracy\t|Precision\t|ROC/AUC','\n'+'================================================================================================================')
    for algoName, confusionMatrix, roc_auc in algoResults:
        tp = int(confusionMatrix[0][0])
        fp = int(confusionMatrix[0][1])
        fn = int(confusionMatrix[1][0])
        tn = int(confusionMatrix[1][1])
        # BUG FIX: guard empty denominators (e.g. a model that never predicts
        # the positive class) instead of raising ZeroDivisionError.
        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        total = tp + tn + fp + fn
        accuracy = (tp + tn) / total if total else 0.0
        f1Score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
        # BUG FIX: round once AFTER scaling to percent — the original rounded
        # to 2 decimals before multiplying by 100, discarding precision.
        f1Score = str(round(f1Score * 100, 2)) + "%"
        accuracy = str(round(accuracy * 100, 2)) + "%"
        precision = str(round(precision * 100, 2)) + "%"
        roc_auc = str(round(roc_auc * 100, 2)) + "%"
        print(algoName+'\t|'+str(tp)+'\t|'+str(tn)+'\t|'+str(fp)+'\t|'+str(fn)+'\t\t|'+str(f1Score)+' \t|'+str(accuracy)+' \t|'+str(precision)+' \t|'+str(roc_auc))
def getConfusionMatix(actualLabels, predictedLabels):
    """Build a 2x2 confusion matrix laid out as [[TP, FP], [FN, TN]].

    A truthy (non-zero) gold label is treated as the positive class;
    predictedLabels is indexed positionally alongside actualLabels.
    """
    confusionMatrix = np.zeros([2, 2])
    for index, label in enumerate(actualLabels):
        gold = int(label)
        predicted = int(predictedLabels[index])
        correct = gold == predicted
        if correct and gold:
            confusionMatrix[0][0] += 1      # true positive
        elif correct:
            confusionMatrix[1][1] += 1      # true negative
        elif gold:
            confusionMatrix[1][0] += 1      # false negative
        else:
            confusionMatrix[0][1] += 1      # false positive
    return confusionMatrix
X_train_values = X_train.values  # raw ndarray of the training features (scaling experiment left commented out)
#X_train_values = StandardScaler().fit_transform(X_train_values)
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
def get_Kmeans_clusters_labels(data, k, n_init):
    """Fit k-means (k-means++ seeding) on `data` and return (labels, inertia)."""
    model = KMeans(init="k-means++", n_clusters=k, n_init=n_init)
    model.fit(data)
    return model.predict(data), model.inertia_
# Run each clustering algorithm with 2 clusters, hoping they align with gender.
predicted_labels_kmeans, inertia = get_Kmeans_clusters_labels(X_all, 2, n_init=12)
# NOTE(review): `affinity` was renamed to `metric` in newer scikit-learn
# releases; this code targets the older API.
ac = AgglomerativeClustering(n_clusters =2, affinity = 'euclidean', linkage = 'complete')
predicted_labels_hc_complete = ac.fit_predict(X_all)
ac = AgglomerativeClustering(n_clusters =2, affinity = 'euclidean', linkage = 'ward')
predicted_labels_hc_ward = ac.fit_predict(X_all)
dbscan_clustering = DBSCAN(eps=300, min_samples=100)  # may emit -1 (noise) labels
predicted_labels_dbscan = dbscan_clustering.fit_predict(X_all)
# Evaluate each clustering against the true gender labels.
# BUG FIX: the cluster labels cover the WHOLE dataset (X_all), but the
# original compared them against y_test — pairing test-set gold labels with
# cluster labels of unrelated rows (and mismatched lengths). Compare against
# y_all, matching the roc_auc_score calls below.
# NOTE(review): cluster ids are arbitrary (0/1 assignment is not aligned to
# the gender coding), so these scores are only a rough signal.
algoResults_clustering = []
cm_keams = getConfusionMatix(y_all, predicted_labels_kmeans)
roc_score = roc_auc_score(y_all, predicted_labels_kmeans)
algoResults_clustering.append(['Kmeans\t', cm_keams, roc_score])
cm_ac_ward = getConfusionMatix(y_all, predicted_labels_hc_ward)
roc_score = roc_auc_score(y_all, predicted_labels_hc_ward)
algoResults_clustering.append(['Agglo-Ward', cm_ac_ward, roc_score])
cm_ac_complete = getConfusionMatix(y_all, predicted_labels_hc_complete)
roc_score = roc_auc_score(y_all, predicted_labels_hc_complete)
algoResults_clustering.append(['Agglo-Max', cm_ac_complete, roc_score])
cm_dbscan = getConfusionMatix(y_all, predicted_labels_dbscan)
roc_score = roc_auc_score(y_all, predicted_labels_dbscan)
algoResults_clustering.append(['DBScan\t', cm_dbscan, roc_score])
printStats(algoResults_clustering)
The clustering algorithms did not perform well at predicting gender. We can try some classification and regression algorithms to get better results for gender prediction.
We use the following three algorithms to improve the gender prediction task.
def executeKNN(X_test, y_train, K, train_features=None):
    """Fit a K-nearest-neighbours classifier and predict labels for X_test.

    K: number of neighbours.
    train_features: training feature matrix; defaults to the notebook-global
    X_train (the original silently relied on that global — the new optional
    parameter keeps the call sites working while making the dependency explicit).
    """
    if train_features is None:
        train_features = X_train
    knn_classifier = KNeighborsClassifier(n_neighbors=K)
    knn_classifier.fit(train_features, y_train)
    return knn_classifier.predict(X_test)
def get_feature_importance(model, features):
    """Return (feature, importance) pairs sorted by importance, descending.

    Works with linear models (reads model.coef_[0]) and tree ensembles
    (reads model.feature_importances_); raises if the model exposes neither.
    """
    has_coef = hasattr(model, "coef_")
    has_importances = hasattr(model, "feature_importances_")
    if not has_coef and not has_importances:
        raise Exception("Not possible to collect feature importances")
    weights = model.coef_[0] if has_coef else model.feature_importances_
    pairs = sorted(zip(features, weights), key=lambda pair: pair[1], reverse=True)
    return list(pairs)
def plotFeatureImportance(model, X_train, threshold= 0.00001):
    """Horizontal bar chart of the model's feature importances above threshold."""
    pairs = get_feature_importance(model, X_train.columns.values)
    importance_table = pd.DataFrame(pairs, columns=["feature", "importance"])
    kept = importance_table[importance_table.importance > threshold]
    plt.barh(kept["feature"], kept["importance"])
# Random forest baseline; its feature importances highlight predictive columns.
rfc = RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1, criterion='entropy')
rfc.fit(X_train, y_train)
labels_rfc = rfc.predict(X_test)
plotFeatureImportance(rfc, X_train, threshold = 0.01)
# Logistic regression with L2 regularization (saga solver).
logReg = LogisticRegression(max_iter = 100, random_state=0,penalty= 'l2',solver='saga')
logReg.fit(X_train, y_train)
labels_logReg_l2 = logReg.predict(X_test)
plotFeatureImportance(logReg, X_train, )
# Logistic regression with L1 regularization.
# BUG FIX: the original reused penalty='l2' here although the variable name
# (labels_logReg_l1) and the results-table label ("LogReg-L1") show an L1
# model was intended; the saga solver supports l1.
# NOTE(review): the old comment claimed max_iter was "set high", but 100 is
# sklearn's default — raise it if convergence warnings appear.
logReg = LogisticRegression(max_iter = 100, random_state=0,penalty= 'l1',solver='saga')
logReg.fit(X_train, y_train)
labels_logReg_l1 = logReg.predict(X_test)
plotFeatureImportance(logReg, X_train)
# Score every classifier on the held-out test set and print a single table.
classification_results = []
evaluated = [
    ['RFC\t', labels_rfc],
    ['LogReg-L1', labels_logReg_l1],
    ['LogReg-L2', labels_logReg_l2],
]
for k_i in [3, 7, 10]:
    evaluated.append(['KNN-' + str(k_i) + '\t', executeKNN(X_test, y_train, k_i)])
for algo_name, labels in evaluated:
    cm = getConfusionMatix(y_test, labels)
    roc_score = roc_auc_score(y_test, labels)
    classification_results.append([algo_name, cm, roc_score])
printStats(classification_results)
Below we display the patterns associated with gender for the columns identified as important (highly weighted features that play a major role in prediction) while training the RFC and logistic regression models.
# MT6 — who purchased the phone, split by gender (stacked bars).
table = pd.crosstab(index=data_filtered["MT6"],
                    columns=data_filtered["Gender"])
table
fig = plt.figure()
fig = table.plot(kind="bar",
                 figsize=(8,8),
                 stacked=True)
plt.xlabel('MT6', fontsize=18)
plt.ylabel('counts', fontsize=16)
plt.title('MT6 with respect to Gender')
"""
This variable shows a clear differentiation between the two genders. The males have bought phones by
themselves most of the time. Some other person have been purchased it for females in many occasions.
"""
#MT6B - Where you charge the battery
ax = sns.catplot(x='MT6B',kind='count',data=data_filtered,orient="h", hue = 'Gender')
ax.fig.autofmt_xdate()
table = pd.crosstab(index=data_filtered["MT6B"],
                    columns=data_filtered["Gender"])
table
"""
Most of the people have been charging it either from home or don't know exactly. It seems like most of the
males do have a exact place to charge their phone and for most of the females, they don't have a idea about that.
When looking at the detailes more closely, a very less number of people charge their phones at work. There are even 14
people who charge their phones at retail shops.
"""
#MT6A - How frequently charge your phone
ax = sns.catplot(x='MT6A',kind='count',data=data_filtered,orient="h", hue = 'Gender')
ax.fig.autofmt_xdate()
"""
People either charge their phones daily or don't exactly know in most of the times.
Most of the females don't know about charging the phone or they don't keep a track about it it seems.
"""
"""
When these two variables are combined, it can be seen that the people who charge their phones frequently are
charging it at their homes while the people who don't know about charging don't know where it's been charging.
This explains the integrity of survey data too.
"""
# MT6C vs MT6A: charging-cost bucket against charging frequency.
table = pd.crosstab(index=data_filtered["MT6C"],
                    columns=data_filtered["MT6A"])
table.plot(kind="bar",
           figsize=(8,8),
           stacked=True)
plt.xlabel('MT6C', fontsize=12)
plt.ylabel('counts', fontsize=12)
plt.title('MT6A with respect to MT6C')
#MT5 - purpose to obtain a Mobile phone
table = pd.crosstab(index=data_filtered["MT5"],
                    columns=data_filtered["Gender"])
table.plot(kind="bar",
           figsize=(8,8),
           stacked=True)
plt.xlabel('MT5', fontsize=18)
plt.ylabel('counts', fontsize=16)
plt.title('MT5 with respect to Gender')
"""
Both males and females have been obtained the mobile phone for nearly same reasons.
There is no clear difference between the gender when it comes to the purpose of buying it.
"""
#MT6C - cost each time to charge your phone’s
sns.catplot(x="Gender",y="MT6C",data=data_filtered)
"""
The cost of charge is a constant for any person. It does seems to vary much with the gender.
"""
#DG1 - Birth year
sns.catplot(x="Gender",y="DG1",data=data_filtered, kind = 'box')
"""
Most of the people who participated the survey are born in the years from 1970 - 1990. There are fe people who
have born before 1940. They cannot considered as outliers since these are possible data points.
"""
#IFI17_2 - How easy to go to the nearest ATM
table = pd.crosstab(index=data_filtered["IFI17_2"],
                    columns=data_filtered["Gender"])
table.plot(kind="bar",
           figsize=(8,8),
           stacked=True)
plt.xlabel('IFI17_2', fontsize=18)
plt.ylabel('counts', fontsize=16)
plt.title('IFI17_2 with respect to Gender')
"""
THis variable does not seems to affect with the gender. The difficulty level is mostly similar
for both genders
"""
A few of the features are common between the models. The patterns for the distinct features are displayed below.
#GN1 - who decides how the money you earn will be used (stacked bars by gender)
table = pd.crosstab(index=data_filtered["GN1"],
                    columns=data_filtered["Gender"])
table.plot(kind="bar",
           figsize=(8,8),
           stacked=True)
plt.xlabel('GN1', fontsize=18)
plt.ylabel('counts', fontsize=16)
plt.title('GN1 with respect to Gender')
"""
According this plot, it is clear that the most of males decide about spending my by themselves.
But for females its different. The decisions are taken by the spouse only for less number of males
with compared to the females in the same category.
"""
All but one of the features are common.
# Stacked-bar breakdowns of household/work variables by gender.
#DL0 - The main income owner of the household
table = pd.crosstab(index=data_filtered["DL0"],
                    columns=data_filtered["Gender"])
table.plot(kind="bar",
           figsize=(8,8),
           stacked=True)
plt.xlabel('DL0', fontsize=12)
plt.ylabel('counts', fontsize=12)
plt.title('DL0 with respect to Gender')
"""
#Most of the people fall in to the categories 'myself' and 'somebody else' are males and females respectively.
#It implies that, males are the main source of income in the households most of the time. For females,
#its somebody other than them
"""
#DG6 - relation with the household head
table = pd.crosstab(index=data_filtered["DG6"],
                    columns=data_filtered["Gender"])
table.plot(kind="bar",
           figsize=(8,8),
           stacked=True)
plt.xlabel('DG6', fontsize=12)
plt.ylabel('counts', fontsize=12)
plt.title('DG6 with respect to Gender')
"""
Most of the peole answered the survey are the head of the household. Among them, the most are males.
Out of the people fall in to the second category (spouse), most of them are females.
This implies that the head of the most households are Males.
"""
#DL1 - WORKING STATUS
table = pd.crosstab(index=data_filtered["DL1"],
                    columns=data_filtered["Gender"])
table.plot(kind="bar",
           figsize=(8,8),
           stacked=True)
plt.xlabel('DL1', fontsize=12)
plt.ylabel('counts', fontsize=12)
plt.title('DL1 with respect to Gender')
"""
Most of the people who were working full time for a regular salary are males while most of the people
who stayed at the home are females. Most of the working peaple are also males.
This clearly shows that, males are working and earning money with compared to the females.
"""
#MT1A - Decisions about having a phone
table = pd.crosstab(index=data_filtered["MT1A"],
                    columns=data_filtered["Gender"])
table.plot(kind="bar",
           figsize=(8,8),
           stacked=True)
plt.xlabel('MT1A', fontsize=12)
plt.ylabel('counts', fontsize=12)
plt.title('MT1A with respect to Gender')
"""
Most of the males have decide by themselves to have a phone whereas for females
it has been decided by her spouse most of the time. None of the sisters have decide it
for any one whereas the parents have decided it for both males and females in neary equal proportions.
"""
#FL4 - On who you do depend for most financial advices
ax = sns.catplot(x='FL4',kind='count',data=data_filtered,orient="h", hue = 'Gender')
ax.fig.autofmt_xdate()
"""
Most of the males depends on themselves when it comes to financial advices. Females depends ond their spouses.
Earlier it was found that the main source of income for most households are males. This might be the reason for
females to depend on their spouse.
"""
#DG3 - marital status, presumably (NOTE(review): the original comment here was
# copy-pasted from the FL4 cell above; the analysis text below discusses
# marital categories — confirm against the data dictionary)
ax = sns.catplot(x='DG3',kind='count',data=data_filtered,orient="h", hue = 'Gender')
ax.fig.autofmt_xdate()
"""
The number of single men are higher with compared to the number of females. The number of Monogamously married
men are also higher than the number of females. From the people in the category widow, least of them are males.
"""
#GN5 - Who decides what kind of financial services you can personally use
table = pd.crosstab(index=data_filtered["GN5"],
                    columns=data_filtered["Gender"])
table.plot(kind="bar",
           figsize=(8,8),
           stacked=True)
plt.xlabel('GN5', fontsize=12)
plt.ylabel('counts', fontsize=12)
plt.title('GN5 with respect to Gender')
"""
Most of the people among the people who decided for themselves are males. The decisons have been taken by the spouse for
most of th females with comapred to the males. There are a considereable number of people who take their decisions
jointly
"""
# NOTE(review): the cell below is an exact duplicate of the GN5 cell above
# (same crosstab, plot and commentary) — almost certainly a copy-paste left
# in by accident; one of the two can safely be deleted.
#GN5 - Who decides what kind of financial services you can personally use
table = pd.crosstab(index=data_filtered["GN5"],
                    columns=data_filtered["Gender"])
table.plot(kind="bar",
           figsize=(8,8),
           stacked=True)
plt.xlabel('GN5', fontsize=12)
plt.ylabel('counts', fontsize=12)
plt.title('GN5 with respect to Gender')
"""
Most of the people among the people who decided for themselves are males. The decisons have been taken by the spouse for
most of th females with comapred to the males. There are a considereable number of people who take their decisions
jointly
"""
#AA3 - Zone
ax = sns.catplot(x='AA3',kind='count',data=data_filtered,orient="h", hue = 'Gender')
ax.fig.autofmt_xdate()
# Backward elimination with OLS: repeatedly drop the feature with the largest
# p-value until every remaining p-value is <= 0.001 (a deliberately strict cut).
y = data_filtered['Gender']
X = data_filtered.drop(['Gender'], axis = 1)
import statsmodels.api as sm
#Backward Elimination
cols = list(X.columns)
pmax = 1
while (len(cols)>0):
    p= []
    X_1 = X[cols]
    X_1 = sm.add_constant(X_1)  # add the intercept term
    model = sm.OLS(y,X_1).fit()
    # reindexing by `cols` keeps one p-value per surviving feature (drops 'const')
    p = pd.Series(model.pvalues,index = cols)
    pmax = max(p)
    feature_with_p_max = p.idxmax()
    if(pmax>0.001):
        cols.remove(feature_with_p_max)  # drop the least significant feature, refit
    else:
        break
selected_features_BE = cols
print(selected_features_BE)
#50 another features were selected by backward elimination
len(selected_features_BE)
# Features that do not appear in the data dictionary (can be eliminated):
#   AA7, AA4
# DG4 - education level
# DL0 - income source
#
# Drop the columns that have no description.
df = data_filtered.drop(columns=['AA7', 'AA4'])
import itertools
import scipy.stats as ss
# Candidate categorical survey columns whose pairwise association will be
# measured with Cramer's V below.
cols = ['AA3', 'DG3', 'DG4', 'DG5_4', 'DG5_5', 'DG5_6', 'DL0', 'DL4_3', 'DL4_5', 'DL4_6', 'DL4_18', 'DL5', 'DL14', 'DL26_5', 'MT1', 'MT4_1', 'MT4_2', 'MT4_5', 'MT6', 'MT10', 'MT17_3', 'MT17_5', 'MT18_1', 'MT18_3', 'MT18_4', 'MT18_5', 'MT18_8', 'FF2', 'MM1', 'MMP1_8', 'IFI1_1', 'IFI3_3', 'IFI1_5', 'IFI17_2', 'FL1', 'FL4', 'FL6_2', 'FL13', 'FL14', 'FB16_6', 'FB26_10', 'FB27_1', 'LN2_4', 'GN1', 'GN2']
def cramers_corrected_stat(confusion_matrix):
    """Calculate the bias-corrected Cramer's V statistic for a
    categorical-categorical association.

    Uses the correction from Bergsma and Wicher,
    Journal of the Korean Statistical Society 42 (2013): 323-328.

    Parameters
    ----------
    confusion_matrix : pandas.DataFrame
        A contingency table, e.g. the output of ``pd.crosstab``.
        NOTE(review): the parameter name shadows ``sklearn.metrics.confusion_matrix``
        imported at the top of the file; the shadowing is local to this function,
        but consider renaming at the next interface-breaking opportunity.

    Returns
    -------
    float
        Corrected Cramer's V in [0, 1]. Degenerate tables (a single row or a
        single column) return 0.0 instead of raising ZeroDivisionError, which
        the original implementation did.
    """
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    # Bias correction for phi^2 and for the table dimensions.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    denom = min(kcorr - 1, rcorr - 1)
    # Guard: a 1xN or Nx1 table makes the denominator 0 — no association to measure.
    if denom <= 0:
        return 0.0
    return np.sqrt(phi2corr / denom)
# Pairwise Cramer's V association matrix over the candidate columns.
# (Review fixes: the no-op `cols = cols` self-assignment was removed, and the
# O(n) `cols.index()` lookups were hoisted out of the pair loop by iterating
# over index combinations directly.)
corrM = np.zeros((len(cols), len(cols)))
for idx1, idx2 in itertools.combinations(range(len(cols)), 2):
    corrM[idx1, idx2] = cramers_corrected_stat(pd.crosstab(df[cols[idx1]], df[cols[idx2]]))
    corrM[idx2, idx1] = corrM[idx1, idx2]  # the matrix is symmetric
corr = pd.DataFrame(corrM, index=cols, columns=cols)
fig, ax = plt.subplots(figsize=(25, 20))
ax = sns.heatmap(corr, annot=True, ax=ax)
ax.set_title("Cramer V Correlation between Variables")
Several of these variables are highly correlated with others; we can remove them to visualize the remaining variables more easily.
# Reduced column set (highly correlated / redundant variables dropped so the
# heatmap stays readable), with Gender included.
cols = [ 'Gender','DG3', 'DG4', 'DG5_4', 'DG5_5', 'DG5_6', 'DL0', 'DL4_3', 'DL4_5', 'DL4_6', 'DL4_18', 'DL5', 'MT1', 'MT4_1', 'MT4_2', 'MT4_5', 'MT6', 'MT10', 'MT17_3', 'MT17_5', 'MT18_1', 'MT18_3', 'MT18_4', 'MT18_8', 'FF2', 'MM1', 'IFI3_3', 'IFI17_2', 'FL4', 'FL6_2', 'FL13', 'FL14', 'FB16_6', 'FB26_10', 'LN2_4', 'GN1', 'GN2']
# Pairwise Cramer's V association matrix (no-op `cols = cols` removed; index
# lookups hoisted out of the pair loop).
corrM = np.zeros((len(cols), len(cols)))
for idx1, idx2 in itertools.combinations(range(len(cols)), 2):
    corrM[idx1, idx2] = cramers_corrected_stat(pd.crosstab(df[cols[idx1]], df[cols[idx2]]))
    corrM[idx2, idx1] = corrM[idx1, idx2]  # symmetric
corr = pd.DataFrame(corrM, index=cols, columns=cols)
fig, ax = plt.subplots(figsize=(28, 28))
ax = sns.heatmap(corr, annot=True, ax=ax)
ax.set_title("Cramer V Correlation between Variables")
The variables identified from the heatmap are as follows:
GN1, GN2, FL4, MT6, DL0.
DL5 (source of income) and DL4_18, DL4_3, DL4_5, DL4_6 are highly correlated; DL4_18, DL4_3, DL4_5, and DL4_6 are categories of DL5, so we can remove those variables while keeping the DL5 variable.
# Column set after dropping the DL4_* categories that duplicate DL5.
cols = [ 'Gender','DG3', 'DG4', 'DG5_4', 'DG5_5', 'DG5_6', 'DL0', 'DL5', 'MT1', 'MT4_1', 'MT4_2', 'MT4_5', 'MT6', 'MT10', 'MT17_3', 'MT17_5', 'MT18_1', 'MT18_3', 'MT18_4', 'MT18_8', 'FF2', 'MM1', 'IFI3_3', 'IFI17_2', 'FL4', 'FL6_2', 'FL13', 'FL14', 'FB16_6', 'FB26_10', 'LN2_4', 'GN1', 'GN2']
# Pairwise Cramer's V association matrix (no-op `cols = cols` removed; index
# lookups hoisted out of the pair loop).
corrM = np.zeros((len(cols), len(cols)))
for idx1, idx2 in itertools.combinations(range(len(cols)), 2):
    corrM[idx1, idx2] = cramers_corrected_stat(pd.crosstab(df[cols[idx1]], df[cols[idx2]]))
    corrM[idx2, idx1] = corrM[idx1, idx2]  # symmetric
corr = pd.DataFrame(corrM, index=cols, columns=cols)
fig, ax = plt.subplots(figsize=(28, 28))
ax = sns.heatmap(corr, annot=True, ax=ax)
ax.set_title("Cramer V Correlation between Variables")
Also, browsing the internet and having a touch screen seem to be highly related. GN1 and GN2 are correlated: the person who decides on the use of money also decides about the purchases made for the household. Hence, GN1 describes the variation described by GN2 as well, and one of the two variables can be removed.
# Columns flagged for removal: the DL4_* categories of DL5, plus GN2 and
# MT4_5, which duplicate information carried by other columns.
removing_cols = ['DL4_18', 'DL4_3', 'DL4_5', 'DL4_6','GN2', 'MT4_5']
# Also add the variables identified earlier by feature selection, to inspect
# their correlations as well.
cols = [ 'Gender','DG3', 'DG4', 'DG5_4', 'DG5_5', 'DG5_6', 'DG6','DL1',
         'DL0', 'DL5', 'MT1', 'MT1A','MT4_1', 'MT4_2', 'MT6','MT6B','MT6A','MT6C','MT5', 'MT10',
         'MT17_3', 'MT17_5', 'MT18_1', 'MT18_3', 'MT18_4', 'MT18_8',
         'FF2', 'MM1', 'IFI3_3', 'IFI17_2', 'FL4', 'FL6_2', 'FL13', 'FL14',
         'FB16_6', 'FB26_10', 'LN2_4', 'GN1', 'GN3', 'GN5']
# Pairwise Cramer's V association matrix (no-op `cols = cols` removed; index
# lookups hoisted out of the pair loop).
corrM = np.zeros((len(cols), len(cols)))
for idx1, idx2 in itertools.combinations(range(len(cols)), 2):
    corrM[idx1, idx2] = cramers_corrected_stat(pd.crosstab(df[cols[idx1]], df[cols[idx2]]))
    corrM[idx2, idx1] = corrM[idx1, idx2]  # symmetric
corr = pd.DataFrame(corrM, index=cols, columns=cols)
fig, ax = plt.subplots(figsize=(28, 28))
ax = sns.heatmap(corr, annot=True, ax=ax)
ax.set_title("Cramer V Correlation between Variables")
DL0 and DG6 are correlated, as are GN5 and GN6.
The variables associated with gender are as follows:
GN1, FL4, MT6, MT1A, DL0, DL1, DG6, GN5, GN3